# Load the pre-built Bible data set that the rest of the script works on.
# NOTE(review): readRDS() reads files written by saveRDS(); the ".rda"
# extension is more commonly used for save() output -- confirm how this file
# was produced.
newBible <- readRDS(file = "newBible.rda")
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.3.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'tibble' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'purrr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.3.2
# Short codes for Bible translations. Not referenced by the code visible in
# this section; kept in case later code relies on it.
versions <- c("kjb", "asv", "drb", "erv", "wbt", "web", "ylt", "akjv", "wnt")

# Coerce the text column to character before tokenising.
newBible$text <- as.character(newBible$text)

# Split the text into one row per token, stored in a new `word` column.
unnested <- newBible %>%
  unnest_tokens(word, text)

# Load tidytext's stop-word table and drop every token that appears in it.
# The join key is spelled out so the result cannot silently change if either
# table ever gains another column with a shared name.
data(stop_words)
stopUnnest <- unnested %>%
  anti_join(stop_words, by = "word")
# Horizontal bar chart of the most frequent non-stop words (more than 10,000
# occurrences), ordered by count.
stopUnnest %>%
  count(word, sort = TRUE) %>%
  filter(n > 10000) %>%
  # Reorder the factor so bars are sorted by count rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  theme_solarized(light = TRUE) +
  ggtitle("Unique Word Frequency") +
  # The x aesthetic holds the words (drawn vertically after coord_flip()), so
  # it is labelled "Word"; the earlier xlab(NULL) was dead code because a later
  # xlab() call replaced it.
  xlab("Word") +
  ylab("Count") +
  theme(
    axis.text.y = element_text(angle = 40, hjust = 1),
    text = element_text(size = 10)
  )
## Warning: package 'bindrcpp' was built under R version 3.3.2

# stopUnnest %>%
# count(word, sort = TRUE) %>%
# filter(n > 600) %>%
# mutate(word = reorder(word, n)) %>%
# ggplot(aes(word, n)) +
# geom_col() +
# xlab(NULL) +
# coord_flip()
# Net sentiment per book and chapter using the Bing lexicon.
# Result columns: bookName, index (the chapter), negative, positive,
# sentiment (positive - negative) and direction ("Positive" / "Negative").
bibleSentiment <- stopUnnest %>%
  # Explicit key: match tokens to lexicon entries on `word` only.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(bookName, index = chapter, sentiment) %>%
  # One column per sentiment label; chapters lacking a label get 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    # A net score of exactly zero is reported as "Positive".
    direction = ifelse(sentiment >= 0, "Positive", "Negative")
  )
# Horizontal bar chart of chapter-level net sentiment for every book. Books are
# ordered by their mean chapter sentiment; each book's bar stacks the net
# scores of its chapters.
ggplot(
  bibleSentiment,
  aes(reorder(bookName, sentiment, FUN = mean), sentiment)
) +
  geom_col() +
  coord_flip() +
  theme_solarized(light = TRUE) +
  # Title and axis labels describe this chart; the previous ones were copied
  # from the word-frequency plot and did not match the data shown.
  ggtitle("Net Sentiment by Book") +
  xlab("Book") +
  ylab("Net sentiment (positive - negative words)") +
  theme(
    axis.text.y = element_text(angle = 40, hjust = 1),
    text = element_text(size = 7)
  )

# Chapter-by-chapter net sentiment for a handful of books, one facet per book,
# rendered as an interactive plotly widget.
gg <- bibleSentiment %>%
  filter(bookName %in% c(
    "Matthew",
    "Mark",
    "Psalm",
    "Genesis",
    "Exodus",
    "Joshua",
    "Revelation"
  )) %>%
  ggplot(aes(index, sentiment, fill = direction)) +
  # A single bar layer: previously the same bars were drawn twice (geom_col()
  # plus geom_bar(stat = "identity")), the second layer brought the legend
  # back despite show.legend = FALSE on the first, and the mapped outline
  # colour was overridden by the constant black outline anyway.
  geom_col(
    position = "identity",
    colour = "black",
    size = 0.5,
    show.legend = FALSE
  ) +
  facet_wrap(~bookName, ncol = 2, scales = "free_x")

plotly::ggplotly(gg)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the non-stop words, limited to 100 words, sized by how often
# each word occurs.
with(
  count(stopUnnest, word),
  wordcloud(word, n, max.words = 100)
)

library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Comparison cloud contrasting the most common Bing-negative and Bing-positive
# words in the book of Matthew.
stopUnnest %>%
  # Restrict to Matthew first so only that book's tokens are joined.
  filter(bookName == "Matthew") %>%
  # Explicit key: match tokens to lexicon entries on `word` only.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word x sentiment matrix of counts, as comparison.cloud()
  # expects; word/sentiment pairs that never occur are filled with 0.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 50
  )

# Count of every non-stop word within each book, most frequent first.
book_words <- stopUnnest %>%
  count(bookName, word, sort = TRUE) %>%
  ungroup()

# Total number of non-stop words per book.
total_words <- book_words %>%
  group_by(bookName) %>%
  summarize(total = sum(n))

# Attach each book's total to its word counts. The key is explicit so the
# join cannot silently pick up additional shared columns.
book_words <- left_join(book_words, total_words, by = "bookName")

# Rank words within each book and compute their share of the book's words.
# row_number() follows row order, so the ranking relies on book_words still
# being sorted by descending count (count(sort = TRUE) above).
freq_by_rank <- book_words %>%
  group_by(bookName) %>%
  mutate(
    rank = row_number(),
    `term frequency` = n / total
  )
freq_by_rank
# Distribution of term frequency (n / total) for four books, one facet each.
book_words %>%
  # %in% tests set membership. The previous `bookName == c(...)` recycled the
  # four names down the column and compared element by element, so a row was
  # kept only when its book happened to line up with the recycled position --
  # silently discarding most rows of these four books.
  filter(bookName %in% c(
    "Genesis",
    "Matthew",
    "Exodus",
    "Leviticus"
  )) %>%
  ggplot(aes(n / total, fill = bookName)) +
  # bins = 30 is ggplot2's default, stated explicitly to avoid the stat_bin()
  # message.
  geom_histogram(bins = 30, show.legend = FALSE) +
  # Cap the x axis at 0.0009 to focus on the low-frequency bulk of the
  # distribution; values above the limit are dropped with a warning.
  xlim(NA, 0.0009) +
  facet_wrap(~bookName, ncol = 2, scales = "free_y")

# Per-book word rank and term frequency (share of the book's counted words).
# Rank is the row position within each book, so it follows the existing row
# order of book_words.
freq_by_rank <- mutate(
  group_by(book_words, bookName),
  rank = row_number(),
  `term frequency` = n / total
)
freq_by_rank

# Rank vs. term frequency on log-log axes, one line per book.
ggplot(
  freq_by_rank,
  aes(x = rank, y = `term frequency`, color = bookName)
) +
  geom_line(size = 1.2, alpha = 0.8) +
  scale_x_log10() +
  scale_y_log10()

# Fit a power law to the middle of the rank/frequency curve (ranks 11-499).
rank_subset <- freq_by_rank %>%
  filter(
    rank < 500,
    rank > 10
  )

# Linear fit in log10-log10 space; keep the model so the plot below can draw
# the line that was actually fitted.
zipf_fit <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
zipf_fit
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -1.016 -0.866

# Rank vs. term frequency on log-log axes with the fitted line overlaid.
# The reference line previously used a hard-coded intercept of -0.62 and slope
# of -1.1, which do not match the coefficients fitted above.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = bookName)) +
  geom_abline(
    intercept = coef(zipf_fit)[[1]],
    slope = coef(zipf_fit)[[2]],
    color = "gray50",
    linetype = 2
  ) +
  geom_line(size = 1.2, alpha = 0.8) +
  scale_x_log10() +
  scale_y_log10()
